package com.dhgate.pic.craw.pageProcessor;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.utils.UrlUtils;

import java.util.List;

import com.google.common.collect.Sets;

/**
 * WebMagic {@link PageProcessor} that follows links matching a caller-supplied
 * pattern and, for each page, publishes the image URLs found inside
 * {@code div#picture} under the result field {@code "img"}.
 *
 * <p>The first element of the published list is the cleaned page title; the
 * remaining elements are the matched image URLs.
 */
public class ImgProcessor implements PageProcessor {

	/** Matches the image URLs to collect; dots are escaped so they only match a literal '.'. */
	private static final String IMG_REGEX =
			"http://www\\.meizitu\\.com/wp-content/uploads/20[0-9]{2}[a-z]/[0-9]{1,4}/[0-9]{1,4}/[0-9]{1,4}\\.jpg";

	/**
	 * Parts of the title that are dropped before it is used as a name: the
	 * literal site name, Unicode punctuation, whitespace and the '|' separator
	 * ('|' is a math symbol, not punctuation, so it is listed explicitly).
	 *
	 * <p>The site name is an alternation branch rather than part of the
	 * character class; inside a class it would strip each of its characters
	 * individually from anywhere in the title.
	 */
	private static final String TITLE_NOISE_REGEX = "妹子图|[|\\pP‘’“”\\s]";

	/** Regex selecting which links on a page are queued for crawling. */
	private final String urlPattern;

	/** Site configuration returned by {@link #getSite()}. */
	private final Site site;

	/**
	 * Creates a processor for the site that hosts {@code startUrl}.
	 *
	 * @param startUrl   URL whose domain is used as the site domain
	 * @param urlPattern regex applied to page links to pick follow-up requests
	 */
	public ImgProcessor(String startUrl, String urlPattern) {
		this.site = Site.me().setDomain(UrlUtils.getDomain(startUrl)).setCharset("GBK");
		// Disabled: would add HTTP 500 to the accepted status codes so such
		// responses are ignored rather than treated as errors.
		//site.setAcceptStatCode(Sets.newHashSet(500));
		this.urlPattern = urlPattern;
	}

	/**
	 * Queues every link matching {@code urlPattern} and stores the cleaned
	 * title followed by the image URLs from {@code div#picture} in the
	 * {@code "img"} field.
	 *
	 * @param page the downloaded page to extract links and images from
	 */
	@Override
	public void process(Page page) {
		List<String> requests = page.getHtml().links().regex(urlPattern).all();
		page.addTargetRequests(requests);

		String imgHostFileName = cleanTitle(page.getHtml().xpath("//title/text()").toString());
		List<String> listProcess = page.getHtml().$("div#picture").regex(IMG_REGEX).all();
		// The title travels with the URLs as element 0 so it can later be
		// extracted and used as the file name.
		listProcess.add(0, imgHostFileName);
		page.putField("img", listProcess);
	}

	/**
	 * Strips the site name, punctuation and whitespace from a page title.
	 *
	 * @param rawTitle title text as extracted from the page; may be {@code null}
	 *                 when the page has no title
	 * @return the cleaned title, or an empty string when {@code rawTitle} is {@code null}
	 */
	private static String cleanTitle(String rawTitle) {
		if (rawTitle == null) {
			// A missing <title> must not abort processing of the page.
			return "";
		}
		return rawTitle.replaceAll(TITLE_NOISE_REGEX, "");
	}

	/**
	 * @return the site configuration built in the constructor
	 */
	@Override
	public Site getSite() {
		return site;
	}
}